library(tidyverse)
library(janitor)
library(here)
library(hrbrthemes)
library(skimr)
library(kableExtra)
library(DT)
library(outbreaks)
library(linelist)
library(earlyR)
library(incidence)
library(incidence2)
library(i2extras)
library(projections)
library(epicontacts)
library(outbreaker2)
library(ape)
theme_set(theme_minimal())

Data cleaning

evd_linelist_raw <- here::here("data", "phm_evd_linelist_2017-11-25.xlsx") %>%
  rio::import()

evd_contacts_raw <- here::here("data", "phm_evd_contacts_2017-11-25.xlsx") %>%
  rio::import()

evd_cleaning_rules <- here::here("data", "phm_evd_cleaning_rules.xlsx") %>%
  rio::import()
evd_linelist_raw
## data.frame [50, 6] 
## #case-ID       chr 39e9dc 664549 b4d8aa 51883d 947e40 9aa197
## Date Of Onset  chr 43018 43024 43025 “18// 10/2017” 43028 43028
## Date.of.report chr 43030 43032 “23/10/2017” 22-10-2017 “2017/10/25” “2017-10-23”
## SEX.           chr Female male male male female female
## Age            dbl 62 28 54 57 23 66
## Location       chr Pseudopolis peudopolis Ankh-Morpork PSEUDOPOLIS Ankh Morpork Pseudopolis
evd_contacts_raw
## data.frame [36, 2] 
## Source Case #ID chr 9aa197 39e9dc c2a389 51883d 51883d 933811
## case.id         chr 426b6d a8e9d8 95fc1d 778316 e37897 99abbe
evd_cleaning_rules
## data.frame [6, 3] 
## bad_spelling  chr f m .missing am peudopolis .missing
## good_spelling chr female male unknown ankh_morpork pseudopolis unknown
## variable      chr sex sex sex location location location
evd_linelist <- evd_linelist_raw %>%
  linelist::clean_data(wordlists = evd_cleaning_rules) %>%
  mutate(across(contains("date"), guess_dates))
evd_contacts <- evd_contacts_raw %>%
  linelist::clean_data()
evd_linelist
## data.frame [50, 6] 
## case_id        chr  39e9dc 664549 b4d8aa 51883d 947e40 9aa197
## date_of_onset  date 2017-10-10 2017-10-16 2017-10-17 2017-10-18 2017-10-20 2017-10-20
## date_of_report date 2017-10-22 2017-10-24 2017-10-23 2017-10-22 2017-10-25 2017-10-23
## sex            chr  female male male male female female
## age            dbl  62 28 54 57 23 66
## location       chr  pseudopolis pseudopolis ankh_morpork pseudopolis ankh_morpork pseudopolis
evd_contacts
## data.frame [36, 2] 
## source_case_id chr 9aa197 39e9dc c2a389 51883d 51883d 933811
## case_id        chr 426b6d a8e9d8 95fc1d 778316 e37897 99abbe

Analysis of contacts and transmissibility

Analysis of contact data

evd_chains <- make_epicontacts(
  linelist = evd_linelist,
  contacts = evd_contacts,
  id = "case_id",
  from = "source_case_id",
  to = "case_id",
  directed = TRUE
)

evd_chains
## 
## /// Epidemiological Contacts //
## 
##   // class: epicontacts
##   // 50 cases in linelist; 36 contacts;  directed 
## 
##   // linelist
## 
## tibble [50, 6] 
## id             chr  39e9dc 664549 b4d8aa 51883d 947e40 9aa197
## date_of_onset  date 2017-10-10 2017-10-16 2017-10-17 2017-10-18 2017-10-20 2017-10-20
## date_of_report date 2017-10-22 2017-10-24 2017-10-23 2017-10-22 2017-10-25 2017-10-23
## sex            chr  female male male male female female
## age            dbl  62 28 54 57 23 66
## location       chr  pseudopolis pseudopolis ankh_morpork pseudopolis ankh_morpork pseudopolis 
## 
##   // contacts
## 
## tibble [36, 2] 
## from chr 9aa197 39e9dc c2a389 51883d 51883d 933811
## to   chr 426b6d a8e9d8 95fc1d 778316 e37897 99abbe
plot(
  evd_chains,
  node_color = "location",
  node_shape = "sex",
  shapes = c(male = "male", female = "female")
  )

It seems that individuals from pseudopolis are the main sources of infection, and almost exclusively infect individuals from ank morpork, with very limited tranmission between individuals from the same location.

get_pairwise(evd_chains, attribute = "location", f = table)
##               values.to
## values.from    ankh_morpork pseudopolis
##   ankh_morpork           13           4
##   pseudopolis            16           3
get_pairwise(evd_chains, attribute = "location", f = table) %>%
  chisq.test(simulate = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000 replicates)
## 
## data:  .
## X-squared = 0.34315, df = NA, p-value = 0.6837

Tabulating and calculating a test, it appears that there isn’t a pattern of differential infections, just that individuals from ankh morpork are more likely to be infected.

Estimating growth rates from epidemic curves

evd_incidence <- evd_linelist %>%
  incidence2::incidence(
    date_index = "date_of_onset",
    groups = "location"
  )
evd_poisson_fit <- evd_incidence %>%
  i2extras::fit_curve(model = "poisson")
plot(evd_poisson_fit)

evd_poisson_growth <- i2extras::growth_rate(
  evd_poisson_fit,
  growth_decay_time = TRUE
)
min_date <- min(evd_linelist$date_of_onset, na.rm = TRUE) %>%
  format("%B %d, %Y")
max_date <- max(evd_linelist$date_of_onset, na.rm = TRUE) %>%
  format("%B %d, %Y")

evd_poisson_growth %>%
  ggplot(aes(x = r, y = fct_reorder(location, r))) +
  geom_errorbar(aes(xmin = r_lower, xmax = r_upper), width = 0.2) +
  geom_point(color = "#7c1073", size = 3) +
  geom_vline(aes(xintercept = 0), linetype = 2, color = "#3b3b3b") +
  labs(
    title = "Estimated daily growth rate of Ebola in Ankh by region",
    subtitle = glue::glue(
      "based on symptom onset date, {min_date} - {max_date}"
      )
  )

Based on both the epidemic curve and the estimated growth rate CIs, it is not possible to tell if the outbreak is growing or shrinking. There is likely insufficient data to make a reliable estimate of the growth rate.

Estimating time-varying reproduction numbers

evd_si <- evd_chains %>%
  get_pairwise("date_of_onset") %>%
  epitrix::fit_disc_gamma()

evd_si
## $mu
## [1] 13.09018
## 
## $cv
## [1] 0.6489035
## 
## $sd
## [1] 8.494266
## 
## $ll
## [1] -122.525
## 
## $converged
## [1] TRUE
## 
## $distribution
## A discrete distribution
##   name: gamma
##   parameters:
##     shape: 2.37486979332248
##     scale: 5.51195897078819
evd_Rt <- incidence::incidence(evd_linelist$date_of_onset) %>%
  EpiEstim::estimate_R(
    method = "parametric_si",
    config = EpiEstim::make_config(mean_si = evd_si$mu, std_si = evd_si$sd)
  )
## Default config will estimate R on weekly sliding windows.
##     To change this change the t_start and t_end arguments.
## Warning in estimate_R_func(incid = incid, method = method, si_sample = si_sample, : You're estimating R too early in the epidemic to get the desired
##             posterior CV.
plot(evd_Rt)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## Warning: Removed 5 row(s) containing missing values (geom_path).

If nothing changes, it seems likely that the outbreak would continue at a relatively steady rate.

Finding who infected whom

Looking at Whole Genome Sequences (WGS)

evd_dna <- read.FASTA(here::here("data", "phm_evd_wgs.fa"))

evd_dna
## 50 DNA sequences in binary format stored in a list.
## 
## All sequences of same length: 18958 
## 
## Labels:
## 39e9dc
## 664549
## b4d8aa
## 51883d
## 947e40
## 9aa197
## ...
## 
## Base composition:
##     a     c     g     t 
## 0.250 0.248 0.248 0.254 
## (Total: 947.9 kb)
identical(labels(evd_dna), evd_linelist$case_id)
## [1] TRUE
neighbour_join <- nj(dist.dna(evd_dna, model = "N"))

neighbour_join
## 
## Phylogenetic tree with 50 tips and 48 internal nodes.
## 
## Tip labels:
##   39e9dc, 664549, b4d8aa, 51883d, 947e40, 9aa197, ...
## 
## Unrooted; includes branch lengths.
neighbour_join <- root(neighbour_join, 1)

plot(neighbour_join, main = "Neighbour Joining tree")
axisPhylo()

Here, our index case is 39e9dc, but we can see a lot of branches of length 0, which could be due to a number of reasons. For example, cases 947e40 and 601d2e could have been infected by an individual outside of out sample, or they could have been infected by 39e9dc and didn’t experience any genetic substitution. As a result of these branches of length 0, it is hard to identify the source of infections for a large number of individuals.

Building delay distributions

incub_mu <- 10.6
incub_sd <- 7.4
incub_cv <- incub_sd / incub_mu

incub_params <- epitrix::gamma_mucv2shapescale(incub_mu, incub_cv)
evd_incubation <- distcrete::distcrete(
  name = "gamma",
  shape = incub_params$shape,
  scale = incub_params$scale,
  w = 0.5,
  interval = 1
)

evd_incubation
## A discrete distribution
##   name: gamma
##   parameters:
##     shape: 2.0518626734843
##     scale: 5.16603773584906
tibble(days = 0:30, prob = evd_incubation$d(0:30)) %>%
  ggplot(aes(x = days, y = prob)) +
  geom_col() +
  labs(
    title = "Incubation period distribution",
    x = "Days from infection to onset of symptoms",
    y = "Probability"
  )

tibble(days = 0:50, prob = evd_si$distribution$d(0:50)) %>%
  ggplot(aes(x = days, y = prob)) +
  geom_col() +
  labs(
    title = "Serial Interval distribution",
    x = "Days between onset of symptoms in primary and secondary cases",
    y = "Probability"
  )

Using the original outbreaker model

outbreak_data <- outbreaker_data(
  dates = evd_linelist$date_of_onset,
  dna = unname(evd_dna),
  w_dens = evd_si$distribution$d(1:100),
  f_dens = evd_incubation$d(1:100)
)
outbreak_config <- create_config(
  move_kappa = FALSE,
  move_pi = FALSE,
  init_pi = 1,
  find_import = FALSE,
  init_tree = "star"
)
set.seed(0)
basic_result <- outbreaker(data = outbreak_data, config = outbreak_config)

basic_result
## 
## 
##  ///// outbreaker results ///
## 
## class:  outbreaker_chains data.frame
## dimensions 201 rows,  158 columns
## ancestries not shown: alpha_1 - alpha_50
## infection dates not shown: t_inf_1 - t_inf_50
## intermediate generations not shown: kappa_1 - kappa_50
## 
## /// head //
## data.frame [3, 8] 
## step   dbl 1 50 100
## post   dbl -1959.847683 -814.859065 -818.448079
## like   dbl -1962.150168 -817.161595 -820.750622
## prior  dbl 2.302485 2.30253 2.302543
## mu     dbl 0.0001 0.000055 0.000042
## pi     dbl 1 1 1
## eps    dbl 0.5 0.5 0.5
## lambda dbl 0.05 0.05 0.05 
## 
## ...
## /// tail //
## data.frame [3, 8] 
## step   dbl 9900 9950 10000
## post   dbl -806.40595 -805.818206 -807.969237
## like   dbl -808.70848 -808.12075 -810.271772
## prior  dbl 2.30253 2.302544 2.302535
## mu     dbl 0.000055 0.000041 0.00005
## pi     dbl 1 1 1
## eps    dbl 0.5 0.5 0.5
## lambda dbl 0.05 0.05 0.05
plot(basic_result)

outbreak_plot_type <- c(
  "trace", "density", "alpha", "t_inf"
  )

map(
  .x = outbreak_plot_type,
  .f = function(.x) {
    if (.x == "density") {
      plot(basic_result, "mu", type = .x, burn = 500)
    } else {
      plot(basic_result, type = .x, burn = 500)
    }
  }
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

plot(basic_result, , type = "network", burn = 500, min_support = 0.05)
basic_res_summary <- summary(basic_result)
basic_res_summary
## $step
##    first     last interval  n_steps 
##        1    10000       50      201 
## 
## $post
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1959.8  -817.6  -813.1  -819.0  -808.7  -798.9 
## 
## $like
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1962.2  -819.9  -815.4  -821.3  -811.0  -801.2 
## 
## $prior
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.302   2.303   2.303   2.303   2.303   2.303 
## 
## $mu
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 3.400e-05 4.601e-05 5.114e-05 5.123e-05 5.610e-05 1.000e-04 
## 
## $pi
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1 
## 
## $tree
##    from to time   support generations
## 1    NA  1  -10 0.2089552          NA
## 2     1  2   -1 0.4577114           1
## 3     1  3    1 0.3830846           1
## 4     1  4   -2 0.3532338           1
## 5     1  5   -4 0.5422886           1
## 6     4  6    4 0.4726368           1
## 7     3  7    7 0.7512438           1
## 8     3  8    6 0.8706468           1
## 9     4  9    3 0.5820896           1
## 10    1 10   -2 0.5074627           1
## 11    1 11    4 0.3532338           1
## 12    1 12    5 0.3930348           1
## 13    4 13    8 0.4975124           1
## 14    6 14   10 0.6666667           1
## 15    5 15   11 0.3781095           1
## 16    4 16   12 0.3233831           1
## 17    6 17   16 0.3134328           1
## 18    6 18   12 0.4726368           1
## 19    6 19   17 0.2835821           1
## 20    6 20   17 0.2636816           1
## 21    6 21   16 0.3233831           1
## 22    6 22   15 0.3383085           1
## 23    4 23   14 0.3034826           1
## 24    9 24   18 0.2587065           1
## 25    6 25   19 0.2039801           1
## 26    8 26   18 0.5920398           1
## 27    6 27   18 0.2487562           1
## 28   33 28   21 0.2935323           1
## 29    9 29   17 0.3134328           1
## 30    8 30   22 0.3582090           1
## 31   13 31   20 0.2338308           1
## 32   30 32   25 0.5820896           1
## 33   28 33   25 0.7014925           1
## 34   20 34   24 0.8308458           1
## 35    8 35   22 0.4875622           1
## 36    8 36   20 0.6119403           1
## 37   27 37   28 0.1691542           1
## 38   34 38   32 0.6567164           1
## 39   11 39   21 0.9552239           1
## 40   14 40   25 0.1990050           1
## 41   15 41   24 0.9552239           1
## 42    3 42   26 0.4975124           1
## 43   41 43   32 0.6417910           1
## 44   35 44   30 0.7910448           1
## 45   22 45   28 0.1691542           1
## 46   30 46   32 0.4278607           1
## 47   34 47   31 0.8358209           1
## 48   40 48   29 0.1890547           1
## 49   32 49   36 0.3333333           1
## 50   40 50   34 0.1741294           1
basic_res_summary$tree  %>%
  ggplot(aes(x = support)) +
  geom_histogram(fill = "grey40", color = "white", bins = 12) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(
    title = "Consesus ancestry: support"
  )
## Warning: Removed 2 rows containing missing values (geom_bar).

outbreak_temp_data <- outbreaker_data(
  dates = evd_linelist$date_of_onset,
  w_dens = evd_si$distribution$d(1:100),
  f_dens = evd_incubation$d(1:100)
)
set.seed(0)
basic_temp_result <- outbreaker(
  data = outbreak_temp_data, config = outbreak_config
  )

basic_temp_result
## 
## 
##  ///// outbreaker results ///
## 
## class:  outbreaker_chains data.frame
## dimensions 201 rows,  158 columns
## ancestries not shown: alpha_1 - alpha_50
## infection dates not shown: t_inf_1 - t_inf_50
## intermediate generations not shown: kappa_1 - kappa_50
## 
## /// head //
## data.frame [3, 8] 
## step   dbl 1 50 100
## post   dbl -365.679658 -300.076884 -295.882138
## like   dbl -367.982143 -302.379369 -298.184623
## prior  dbl 2.302485 2.302485 2.302485
## mu     dbl 0.0001 0.0001 0.0001
## pi     dbl 1 1 1
## eps    dbl 0.5 0.5 0.5
## lambda dbl 0.05 0.05 0.05 
## 
## ...
## /// tail //
## data.frame [3, 8] 
## step   dbl 9900 9950 10000
## post   dbl -305.136006 -303.676142 -308.474933
## like   dbl -307.438491 -305.978627 -310.777418
## prior  dbl 2.302485 2.302485 2.302485
## mu     dbl 0.0001 0.0001 0.0001
## pi     dbl 1 1 1
## eps    dbl 0.5 0.5 0.5
## lambda dbl 0.05 0.05 0.05
plot(basic_temp_result, type = "alpha")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

Adding contact data to the reconstruction process

contact_data <- matrix(
  match(
    unlist(evd_chains$contacts),
    evd_linelist$case_id
    ),
  ncol = 2)

dim(contact_data)
## [1] 36  2
outbreak_cont_data <- outbreaker_data(
  dates = evd_linelist$date_of_onset,
  dna = unname(evd_dna),
  ctd = contact_data,
  w_dens = evd_si$distribution$d(1:100),
  f_dens = evd_incubation$d(1:100)
)
set.seed(0)
full_result <- outbreaker(
  data = outbreak_cont_data, config = outbreak_config
  )

full_result
## 
## 
##  ///// outbreaker results ///
## 
## class:  outbreaker_chains data.frame
## dimensions 201 rows,  158 columns
## ancestries not shown: alpha_1 - alpha_50
## infection dates not shown: t_inf_1 - t_inf_50
## intermediate generations not shown: kappa_1 - kappa_50
## 
## /// head //
## data.frame [3, 8] 
## step   dbl 1 50 100
## post   dbl -2133.492486 -879.824167 -842.566586
## like   dbl -2135.794972 -882.126696 -844.869121
## prior  dbl 2.302485 2.302529 2.302535
## mu     dbl 0.0001 0.000056 0.00005
## pi     dbl 1 1 1
## eps    dbl 0.5 0.58305 0.844773
## lambda dbl 0.05 0.009765 0.002261 
## 
## ...
## /// tail //
## data.frame [3, 8] 
## step   dbl 9900 9950 10000
## post   dbl -840.629255 -836.36688 -842.49067
## like   dbl -842.931782 -838.669416 -844.793195
## prior  dbl 2.302527 2.302537 2.302524
## mu     dbl 0.000058 0.000049 0.000061
## pi     dbl 1 1 1
## eps    dbl 0.771107 0.665784 0.596004
## lambda dbl 0.001723 0.001083 0.002275
map(
  .x = outbreak_plot_type,
  .f = function(.x) {
    if (.x == "density") {
      plot(full_result, "mu", type = .x, burn = 500)
    } else {
      plot(full_result, type = .x, burn = 500)
    }
  }
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

plot(full_result, , type = "network", burn = 500, min_support = 0.05)
full_res_summary <- summary(full_result)
full_res_summary
## $step
##    first     last interval  n_steps 
##        1    10000       50      201 
## 
## $post
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2133.5  -845.4  -841.7  -848.4  -838.0  -831.1 
## 
## $like
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2135.8  -847.7  -844.0  -850.7  -840.3  -833.4 
## 
## $prior
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.302   2.303   2.303   2.303   2.303   2.303 
## 
## $mu
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 3.195e-05 4.496e-05 5.051e-05 5.085e-05 5.635e-05 1.000e-04 
## 
## $pi
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1 
## 
## $tree
##    from to time   support generations
## 1    NA  1   -8        NA          NA
## 2     1  2    1 0.6517413           1
## 3     1  3    1 1.0000000           1
## 4     1  4   -1 0.6616915           1
## 5     1  5   -1 1.0000000           1
## 6     4  6    5 0.6716418           1
## 7     3  7    6 0.9950249           1
## 8     3  8    7 0.9950249           1
## 9     4  9    4 0.9950249           1
## 10    1 10    0 1.0000000           1
## 11    1 11    6 0.9950249           1
## 12    1 12    3 0.9900498           1
## 13    4 13   10 0.9950249           1
## 14    6 14    9 0.9950249           1
## 15    1 15    9 1.0000000           1
## 16    4 16   11 0.9950249           1
## 17   14 17   16 0.9950249           1
## 18    6 18   16 0.4228856           1
## 19    6 19   15 0.9950249           1
## 20    6 20   15 0.9900498           1
## 21   14 21   16 0.9950249           1
## 22    6 22   18 0.2537313           1
## 23    4 23   12 0.9900498           1
## 24    4 24   13 0.9950249           1
## 25    6 25   21 0.2437811           1
## 26    3 26   16 0.9900498           1
## 27   21 27   23 0.9950249           1
## 28    6 28   17 0.9950249           1
## 29   23 29   21 0.9950249           1
## 30    8 30   19 0.6169154           1
## 31   23 31   23 0.9950249           1
## 32   30 32   27 0.9950249           1
## 33   28 33   26 0.9950249           1
## 34   20 34   24 0.9950249           1
## 35    3 35   20 0.9950249           1
## 36    3 36   22 0.9950249           1
## 37   22 37   29 0.1641791           1
## 38   34 38   32 0.9900498           1
## 39   11 39   24 0.9950249           1
## 40   22 40   27 0.8805970           1
## 41   15 41   22 0.9950249           1
## 42    3 42   24 0.9950249           1
## 43   41 43   32 0.9900498           1
## 44   35 44   31 0.9950249           1
## 45   18 45   28 0.1641791           1
## 46   30 46   30 0.9900498           1
## 47   34 47   33 0.9950249           1
## 48    6 48   21 0.9950249           1
## 49   30 49   33 0.9900498           1
## 50   48 50   35 0.1691542           1
full_res_summary$tree  %>%
  ggplot(aes(x = support)) +
  geom_histogram(fill = "grey40", color = "white", bins = 12) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(
    title = "Consesus ancestry: support"
  )
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

full_res_summary$tree$support <- round(full_res_summary$tree$support, 2)

evd_linelist$id <- 1:nrow(evd_linelist)
cons_tree <- make_epicontacts(
  evd_linelist,
  full_res_summary$tree[-1, ],
  id = "id",
  from = 1,
  to = 2,
  directed = TRUE
  )
support_pal <- colorRampPalette(
    c("#918D98", "#645877", "#423359", "#281449", "#1A0340")
)

age_pal <- colorRampPalette(
    c("#3288BD", "#ABDDA4", "#FDAE61", "#D53E4F")
)

cons_tree$linelist$age_class <- cut(
  cons_tree$linelist$age,
  breaks = c(0, 10, 20, 30, 40, 90),
  labels = c("0-10", "11-20", "21-30", "31-40", "41+")
  )
plot(
  cons_tree,
  node_color = "age_class",
  node_shape = "sex",
  shapes = c(male = "male", female = "female")
)